Mine Common French Words from Wiktionary

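Sources: the Wiktionary French frequency lists (ranks 1–10000), fetched from the URLs defined in the first code cell below.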



In [1]:

    
# Wiktionary French frequency-list pages, one per block of 2000 ranks.
# Every page shares the same URL prefix and differs only in its rank range.
url_list = [
    "https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists/" + rank_range
    for rank_range in ("1-2000", "2001-4000", "4001-6000", "6001-8000", "8001-10000")
]



In [2]:

    
from urllib.request import urlopen
from lxml import html
import unicodedata



In [3]:

    
# Processed words, kept as a list so insertion order is preserved.
french_words = []
# Companion set of the same words, used for membership checks.
french_words_set = set()



In [4]:

    
# Function to strip accents
def strip_accents(s):
    """Return ``s`` with its combining marks removed.

    The string is decomposed with Unicode NFD normalization, then every
    character whose Unicode category is ``Mn`` is dropped and the remaining
    characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Function to process a word
def process_french_word(word):
    """Normalize ``word``: lowercase it, then strip its accents."""
    lowered = word.lower()
    return strip_accents(lowered)



In [5]:

    
# Download each frequency-list page and collect its words, keeping only the
# first occurrence of each processed (lowercased, accent-stripped) word.
for url in url_list:
    # Use the response as a context manager so the HTTP connection is closed
    # as soon as the body has been read, instead of being leaked.
    with urlopen(url) as response:
        page_html = response.read()
    tree = html.fromstring(page_html)
    word_list = tree.xpath('.//div/table//tr//li/span/a')
    for w in word_list:
        word = w.text
        # lxml leaves .text as None when an element has no leading text
        # (for example a link that only wraps a child element). Skip those,
        # and empty strings, rather than crashing in .lower().
        if not word:
            continue
        proc_word = process_french_word(word)
        if proc_word not in french_words_set:
            french_words_set.add(proc_word)
            french_words.append(proc_word)



In [6]:

    
# Write words to a text file, one word per line.
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps the output independent of the platform's default
# encoding (accent stripping does not guarantee ASCII-only words).
with open("french.txt", 'w', encoding="utf-8") as f_out:
    for word in french_words:
        f_out.write(word + "\n")



In [ ]: